AddFusion
逐元素求和,支持广播和激活函数
- 其中 activation_type(即 param[7])可选:
0: 无激活,output = input0 + input1
1: ReLU激活,output = max(0, input0 + input1)
2: ReLU6激活,output = min(max(0, input0 + input1), 6)
- 输入:
input0 - 第一个输入数据地址。
input1 - 第二个输入数据地址。
output - 输出数据地址。
- param - 参数数组(8个元素),包含以下内容:
param[0]: input0_dims 地址(根据维度数量,分配相应空间)
param[1]: input1_dims 地址(根据维度数量,分配相应空间)
param[2]: output_dims 地址(根据维度数量,分配相应空间)
param[3]: strides0 地址(需分配 8*sizeof(int) 空间)
param[4]: strides1 地址(需分配 8*sizeof(int) 空间)
param[5]: strides_output 地址(需分配 8*sizeof(int) 空间)
param[6]: num_dims(维度数量,最大支持8维)
param[7]: activation_type(激活类型:0=无激活, 1=ReLU, 2=ReLU6)
core_mask - 核掩码(仅共享存储版本)。
- 输出:
output - 计算结果地址。
- 支持平台:
FT78NE、MT7004
- 备注:
FT78NE 支持int8, int16, int32, fp32, fp64, cplx64, cplx128
MT7004 支持fp16, fp32, int16, int32, cplx64
cplx64 和 cplx128 不支持激活函数(activation_type 参数无效,始终执行普通加法)
最大支持 8 维张量
- 功能说明:
支持同形状加法(两个输入形状相同)
支持标量广播加法(一个输入为标量)
支持复杂广播加法(任意形状广播,最大8维)
共享存储版本:
-
void i8_add_s(int8_t *input0, int8_t *input1, int8_t *output, unsigned long long *param, int core_mask)
-
void i16_add_s(int16_t *input0, int16_t *input1, int16_t *output, unsigned long long *param, int core_mask)
-
void i32_add_s(int *input0, int *input1, int *output, unsigned long long *param, int core_mask)
-
void hp_add_s(half *input0, half *input1, half *output, unsigned long long *param, int core_mask)
-
void fp_add_s(float *input0, float *input1, float *output, unsigned long long *param, int core_mask)
-
void dp_add_s(double *input0, double *input1, double *output, unsigned long long *param, int core_mask)
-
void c64_add_s(float *input0, float *input1, float *output, unsigned long long *param, int core_mask)
-
void c128_add_s(double *input0, double *input1, double *output, unsigned long long *param, int core_mask)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <add.h>
4
5#define MAX_DIMS 8
6
7int main(int argc, char* argv[]) {
8 float *input0 = (float *)0xA0000000; // input在DDR空间
9 float *input1 = (float *)0xA0001000;
10 float *output = (float *)0xA0002000;
11
12 int input0_dims[MAX_DIMS] = {1024, 1024};
13 int input1_dims[MAX_DIMS] = {1024, 1024};
14 int output_dims[MAX_DIMS] = {1024, 1024};
15 int strides0[MAX_DIMS], strides1[MAX_DIMS], strides_out[MAX_DIMS];
16 int num_dims = 2;
17 int activation_type = 0; // 无激活
18
19 unsigned long long param[8];
20 param[0] = (unsigned long long)input0_dims;
21 param[1] = (unsigned long long)input1_dims;
22 param[2] = (unsigned long long)output_dims;
23 param[3] = (unsigned long long)strides0;
24 param[4] = (unsigned long long)strides1;
25 param[5] = (unsigned long long)strides_out;
26 param[6] = (unsigned long long)num_dims;
27 param[7] = (unsigned long long)activation_type;
28
29 int core_mask = 0xff;
30 fp_add_s(input0, input1, output, param, core_mask);
31 return 0;
32}
私有存储版本:
-
void i8_add_p(int8_t *input0, int8_t *input1, int8_t *output, unsigned long long *param)
-
void i16_add_p(int16_t *input0, int16_t *input1, int16_t *output, unsigned long long *param)
-
void i32_add_p(int *input0, int *input1, int *output, unsigned long long *param)
-
void hp_add_p(half *input0, half *input1, half *output, unsigned long long *param)
-
void fp_add_p(float *input0, float *input1, float *output, unsigned long long *param)
-
void dp_add_p(double *input0, double *input1, double *output, unsigned long long *param)
-
void c64_add_p(float *input0, float *input1, float *output, unsigned long long *param)
-
void c128_add_p(double *input0, double *input1, double *output, unsigned long long *param)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <add.h>
4
5#define MAX_DIMS 8
6
7int main(int argc, char* argv[]) {
8 float *input0 = (float *)0x10800000; // input在L2空间
9 float *input1 = (float *)0x10801000;
10 float *output = (float *)0x10802000;
11
12 int input0_dims[MAX_DIMS] = {1024, 1024};
13 int input1_dims[MAX_DIMS] = {1024, 1024};
14 int output_dims[MAX_DIMS] = {1024, 1024};
15 int strides0[MAX_DIMS], strides1[MAX_DIMS], strides_out[MAX_DIMS];
16 int num_dims = 2;
17 int activation_type = 1; // ReLU激活
18
19 unsigned long long param[8];
20 param[0] = (unsigned long long)input0_dims;
21 param[1] = (unsigned long long)input1_dims;
22 param[2] = (unsigned long long)output_dims;
23 param[3] = (unsigned long long)strides0;
24 param[4] = (unsigned long long)strides1;
25 param[5] = (unsigned long long)strides_out;
26 param[6] = (unsigned long long)num_dims;
27 param[7] = (unsigned long long)activation_type;
28
29 fp_add_p(input0, input1, output, param);
30 return 0;
31}